{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "fad6ee3f-45b8-4ac3-aa39-4a44dac91994",
   "metadata": {},
   "source": [
    "## Creating Text Embeddings From a Text File\n",
    "- Loading data using TextLoader\n",
    "- Splitting into chunks using CharacterTextSplitter\n",
    "- Converting chunks into vector embeddings and creating a vectorstore\n",
    "- Retreiving, reducing dimensions to 2D and displaying text embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "33b79f0d-7bd5-4e82-9295-2cc5cfa9495b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# imports\n",
    "\n",
    "import os\n",
    "from dotenv import load_dotenv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "391d12b3-ea25-4c66-93ba-71ef7c590be3",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.document_loaders import DirectoryLoader, TextLoader\n",
    "from langchain.text_splitter import CharacterTextSplitter\n",
    "from langchain.schema import Document\n",
    "from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n",
    "from langchain.embeddings import HuggingFaceEmbeddings\n",
    "from langchain_chroma import Chroma\n",
    "import numpy as np\n",
    "from sklearn.manifold import TSNE\n",
    "import plotly.graph_objects as go"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "365d4346-bcf7-48b3-be13-b492f1877fab",
   "metadata": {},
   "outputs": [],
   "source": [
    "MODEL = \"gpt-4o-mini\"\n",
    "db_name = \"my_vector_db\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "93887c1e-fb5e-4f9a-95f6-91a284e49695",
   "metadata": {},
   "outputs": [],
   "source": [
    "load_dotenv(override=True)\n",
    "os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "86289eb8-25d8-405f-b1bb-3d9d9fed8671",
   "metadata": {},
   "outputs": [],
   "source": [
    "loader = TextLoader(\"data.txt\", encoding=\"utf-8\")\n",
    "data = loader.load()\n",
    "\n",
    "documents = []\n",
    "for text in data:\n",
    "    documents.append(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "32320fff-2321-40ea-9b7d-294dc2dfba3a",
   "metadata": {},
   "outputs": [],
   "source": [
    "text_splitter = CharacterTextSplitter(chunk_size=20, chunk_overlap=5)\n",
    "chunks = text_splitter.split_documents(documents)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fce762a5-4c78-4102-ab55-f95ee0c97286",
   "metadata": {},
   "outputs": [],
   "source": [
    "len(chunks)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ddb5bc12-af30-476d-bbbb-f91a3ae8af2f",
   "metadata": {},
   "outputs": [],
   "source": [
    "embeddings = OpenAIEmbeddings()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "75ba81ec-9178-4ce4-83e2-82f937c85902",
   "metadata": {},
   "outputs": [],
   "source": [
    "if os.path.exists(db_name):\n",
    "    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c3ca2632-a8b3-4e7e-8370-d91579d31c23",
   "metadata": {},
   "outputs": [],
   "source": [
    "vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)\n",
    "print(f\"Vectorstore created with {vectorstore._collection.count()} documents\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0de67066-73f5-446f-9033-a00d45b0cdc1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get one vector and find how many dimensions it has\n",
    "\n",
    "collection = vectorstore._collection\n",
    "sample_embedding = collection.get(limit=1, include=[\"embeddings\"])[\"embeddings\"][0]  # represents a single vector\n",
    "dimensions = len(sample_embedding)\n",
    "print(f\"The vectors have {dimensions:,} dimensions\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e50d972c-d740-4f0a-8bc2-e55ebe462a41",
   "metadata": {},
   "outputs": [],
   "source": [
    "sample_embedding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aa96105d-b882-48d9-b088-6aab5db7b1e9",
   "metadata": {},
   "outputs": [],
   "source": [
    "result = collection.get(include=['embeddings','documents'])\n",
    "vectors = np.array(result['embeddings'])  \n",
    "documents = result['documents']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "213b4cf2-db0a-4610-8d8f-97607996ed17",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reduce dimensionality to 2D using t-SNE\n",
    "tsne = TSNE(n_components=2,perplexity=5, random_state=42)\n",
    "reduced_vectors = tsne.fit_transform(vectors)\n",
    "\n",
    "# Create the 2D scatter plot\n",
    "fig = go.Figure(data=[go.Scatter(\n",
    "    x=reduced_vectors[:, 0],\n",
    "    y=reduced_vectors[:, 1],\n",
    "    mode='markers',\n",
    "    marker=dict(size=5, opacity=0.8),\n",
    "    text=[f\"Text: {d[:200]}...\" for d in documents],\n",
    "    hoverinfo='text'\n",
    ")])\n",
    "\n",
    "fig.update_layout(\n",
    "    title='2D Chroma Vector Store Visualization',\n",
    "    scene=dict(xaxis_title='x',yaxis_title='y'),\n",
    "    width=800,\n",
    "    height=600,\n",
    "    margin=dict(r=20, b=10, l=10, t=40)\n",
    ")\n",
    "\n",
    "fig.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7d13aa60-da3e-4c61-af69-1ba9087e0181",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
